This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Read the insurance cost data set (path is relative to the working directory).
insurance <- read.csv("insurance_cost.csv")

# plotly can be installed with either of:
# install.packages("plotly")
# or
# devtools::install_github("ropensci/plotly")
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Print a skimr overview of the data frame; the tables below report, per
# column, missing counts, completeness and basic distribution statistics.
skimr::skim(insurance)
| Name | insurance |
| Number of rows | 1338 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| sex | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| smoker | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| region | 0 | 1 | 9 | 9 | 0 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 39.21 | 14.05 | 18.00 | 27.00 | 39.00 | 51.00 | 64.00 | ▇▅▅▆▆ |
| bmi | 0 | 1 | 30.66 | 6.10 | 15.96 | 26.30 | 30.40 | 34.69 | 53.13 | ▂▇▇▂▁ |
| children | 0 | 1 | 1.09 | 1.21 | 0.00 | 0.00 | 1.00 | 2.00 | 5.00 | ▇▂▂▁▁ |
| charges | 0 | 1 | 13270.42 | 12110.01 | 1121.87 | 4740.29 | 9382.03 | 16639.91 | 63770.43 | ▇▂▁▁▁ |
# Show the structure of the data frame: column names, types, first values.
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
# Per-column summary: min/quartiles/mean/max for numeric columns and
# length/class/mode for character columns.
summary(insurance)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
You can also embed plots, for example:
# Age histogram.
# Ages are whole years (18-64), so use one bin per year. The default of 30
# bins over that range produces bins that hold either one or two distinct
# ages, which shows up as artificial spikes in the histogram.
ggplot(data = insurance, aes(x = age)) +
  geom_histogram(binwidth = 1)
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
# BMI histogram (rows with a BMI of exactly 0 are excluded).
# The bar colour is a fixed value, so it is set through `marker` instead of
# the `color` argument: `color` is treated as data to be mapped onto a
# palette, which ignored the requested colour and triggered RColorBrewer's
# "minimal value for n is 3" warning.
plot_ly(
  insurance[insurance$bmi != 0, ],
  x = ~bmi,
  type = "histogram",
  marker = list(color = "yellow")
)
# Histogram of the number of children.
# The bar colour is a fixed value, so it is set through `marker` instead of
# the `color` argument: `color` is treated as data to be mapped onto a
# palette, which ignored the requested colour and triggered RColorBrewer's
# "minimal value for n is 3" warning.
plot_ly(
  insurance,
  x = ~children,
  type = "histogram",
  marker = list(color = "pink")
)
# Charges histogram (rows with charges of exactly 0 are excluded).
# The bar colour is a fixed value, so it is set through `marker` instead of
# the `color` argument: `color` is treated as data to be mapped onto a
# palette, which ignored the requested colour and triggered RColorBrewer's
# "minimal value for n is 3" warning.
plot_ly(
  insurance[insurance$charges != 0, ],
  x = ~charges,
  type = "histogram",
  marker = list(color = "brown")
)
# Density of charges with the overall mean and median marked.
charge_mean <- round(mean(insurance$charges), 1)
charge_median <- round(median(insurance$charges), 1)

# The line colours are constants, so they are passed outside aes(). Inside
# aes() the strings "blue"/"red" are treated as category labels and the lines
# are drawn with the default discrete palette instead of the named colours.
den <- ggplot(data = insurance, aes(x = charges)) +
  geom_density(color = "darkblue", fill = "lightblue") +
  xlab("Charges") +
  ylab("Density") +
  geom_vline(xintercept = charge_mean, color = "blue") +
  annotate(
    "text",
    x = charge_mean + 10000,
    y = 0.00001,
    label = paste0("Mean=", charge_mean)
  ) +
  geom_vline(xintercept = charge_median, color = "red") +
  annotate(
    "text",
    x = charge_median + 500,
    y = 0.00001,
    label = paste0("Median=", charge_median)
  ) +
  theme_bw() +
  theme(legend.position = "none")

# charges and sex
# Box plots of charges by sex, smoking status and region. Each statement is
# on its own line; when run together (e.g. `theme_dark()box2 <- ...`) the
# code does not parse.
box1 <- ggplot() +
  geom_boxplot(data = insurance, aes(x = sex, y = charges)) +
  theme_dark()

box2 <- ggplot() +
  geom_boxplot(data = insurance, aes(x = smoker, y = charges)) +
  theme_get()

box3 <- ggplot() +
  geom_boxplot(data = insurance, aes(x = region, y = charges)) +
  theme_grey()

library(ggpubr)

# Layout: the density plot (A) fills the top row; the three box plots (B-D)
# sit side by side in a nested arrangement on the bottom row.
combine_plot <- ggarrange(
  den,
  ggarrange(box1, box2, box3, ncol = 3, labels = c("B", "C", "D")),
  nrow = 2,
  labels = "A"
)
combine_plot +
  theme_void() +
  ggtitle("Characterisation of Charges value")

charge_mean <- round(mean(insurance$charges), 1)
charge_median <- round(median(insurance$charges), 1)

# Density of charges, one facet per region. The mean/median lines use the
# whole-data-set values, so they are identical in every facet.
# Line colours are constants and therefore passed outside aes(); inside aes()
# the strings "blue"/"red" are treated as category labels, not as colours.
den <- ggplot(data = insurance, aes(x = charges, group = region)) +
  geom_density(color = "darkblue", fill = "lightblue") +
  xlab("Charges") +
  ylab("Density") +
  geom_vline(xintercept = charge_mean, color = "blue") +
  annotate(
    "text",
    x = charge_mean + 10000,
    y = 0.00001,
    label = paste0("Mean=", charge_mean)
  ) +
  geom_vline(xintercept = charge_median, color = "red") +
  annotate(
    "text",
    x = charge_median + 500,
    y = 0.00001,
    label = paste0("Median=", charge_median)
  ) +
  facet_grid(. ~ region) +
  theme_bw() +
  theme(legend.position = "none")
den

# Charges against age.
# The complete theme is applied first: adding theme_classic() after theme()
# replaces every earlier theme setting, which discarded the axis text size.
insurance %>%
  ggplot(aes(x = age, y = charges)) +
  geom_point(size = 3) +
  xlab("age") +
  ylab("charges") +
  ggtitle("How charges changing with age") +
  theme_classic() +
  theme(axis.text.x = element_text(size = 14))

# Charges against age, points coloured by smoking status, with one linear
# fit per smoker group (both fit lines are drawn in red).
insurance %>%
  ggplot(
    aes(x = age, y = charges, color = smoker, fill = smoker, group = smoker)
  ) +
  geom_point(size = 3) +
  geom_smooth(
    method = "lm",
    color = "red",
    fullrange = TRUE,
    fill = "#69b3a2",
    se = TRUE
  ) +
  xlab("age") +
  ylab("charges") +
  ggtitle("How charges changing with age") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))
## `geom_smooth()` using formula 'y ~ x'
# Charges against BMI, points coloured by smoking status, with one linear fit
# per smoker group (both fit lines are drawn in red).
# theme_bw() is applied before theme(): a complete theme added afterwards
# replaces every earlier theme setting, which discarded the axis text size.
insurance %>%
  ggplot(
    aes(x = bmi, y = charges, color = smoker, fill = smoker, group = smoker)
  ) +
  geom_point(size = 2) +
  geom_smooth(
    method = "lm",
    color = "red",
    fullrange = TRUE,
    fill = "#69b3a2",
    se = TRUE
  ) +
  xlab("bmi") +
  ylab("charges") +
  ggtitle("How charges changing with body mass index") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))
## `geom_smooth()` using formula 'y ~ x'
А влияет ли курение на индекс массы тела в наших данных?
# BMI distribution for smokers vs non-smokers.
# theme_bw() is applied before theme(): a complete theme added afterwards
# replaces every earlier theme setting, which discarded the axis text size.
insurance %>%
  ggplot(aes(x = smoker, y = bmi)) +
  geom_boxplot() +
  ggtitle("Is there a relationship between bmi and smoking?") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))

Как видно из графика статистически значимой разницы по индексу массы тела в группах курящих и не курящих не обнаружено. Посмотрим, есть ли разница у разных полов.
# BMI distribution for smokers vs non-smokers, split by sex.
# theme_bw() is applied before theme(): a complete theme added afterwards
# replaces every earlier theme setting, which discarded the axis text size.
insurance %>%
  ggplot(aes(x = smoker, y = bmi, fill = sex)) +
  geom_boxplot() +
  ggtitle("Is there a relationship between bmi and smoking related to sex?") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))

Значимых различий так же не обнаружено.
В 10-м задании мы увидели сильную зависимость charges от курения и bmi - наблюдается сильная корреляция у курящих с bmi и затратами страховой. Хотелось бы узнать, какие показатели здоровья меняются с возрастом у групп курящих и некурящих.
# BMI against age, points coloured by smoking status, with one linear fit per
# smoker group (both fit lines are drawn in red).
# theme_bw() is applied before theme(): a complete theme added afterwards
# replaces every earlier theme setting, which discarded the axis text size.
insurance %>%
  ggplot(
    aes(x = age, y = bmi, color = smoker, fill = smoker, group = smoker)
  ) +
  geom_point(size = 2) +
  geom_smooth(
    method = "lm",
    color = "red",
    fullrange = TRUE,
    fill = "#69b3a2",
    se = TRUE
  ) +
  xlab("age") +
  ylab("bmi") +
  ggtitle("How bmi changing with age and smoking") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))
## `geom_smooth()` using formula 'y ~ x'
Никаких различий не обнаружено в группах курящих и некурящих. Попробуем поискать в регионах.
# BMI against age, grouped by region, with one linear fit per region.
# Age is mapped to x and BMI to y so the data match the axis labels and the
# title; the mapping was previously the other way round (x = bmi, y = age)
# while the axes were labelled "age" and "bmi".
# theme_bw() is applied before theme(): a complete theme added afterwards
# replaces every earlier theme setting, which discarded the axis text size.
# NOTE(review): geom_boxplot() on a continuous x draws one box per region
# rather than the individual observations; confirm geom_point() was not
# intended here.
insurance %>%
  ggplot(
    aes(x = age, y = bmi, color = region, fill = region, group = region)
  ) +
  geom_boxplot() +
  geom_smooth(
    method = "lm",
    color = "red",
    fullrange = TRUE,
    fill = "#69b3a2",
    se = TRUE
  ) +
  xlab("age") +
  ylab("bmi") +
  ggtitle("How bmi changing with age and region") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))
## `geom_smooth()` using formula 'y ~ x'
Снова не видим различий в регионах. Может быть, они есть в затратах на человека, в зависимости от индекса массы тела и региона?
# Charges against BMI, grouped by region, with one linear fit per region.
# The title now describes what is plotted (charges vs bmi); the previous
# title was copied from the bmi-vs-age figure.
# theme_bw() is applied before theme(): a complete theme added afterwards
# replaces every earlier theme setting, which discarded the axis text size.
# NOTE(review): geom_boxplot() on a continuous x draws one box per region
# rather than the individual observations; confirm geom_point() was not
# intended here.
insurance %>%
  ggplot(
    aes(x = bmi, y = charges, color = region, fill = region, group = region)
  ) +
  geom_boxplot() +
  geom_smooth(
    method = "lm",
    color = "red",
    fullrange = TRUE,
    fill = "#69b3a2",
    se = TRUE
  ) +
  xlab("bmi") +
  ylab("charges") +
  ggtitle("How charges changing with bmi and region") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14))
## `geom_smooth()` using formula 'y ~ x'
Кажется, что да - продвигаясь с северо-запада на юго-восток видим постепенное увеличение затрат с ростом индекса массы тела. Значит, регион проживания влияет на увеличение затрат, но не зависит от абсолютного показателя индекса массы тела.